Skip to content

Conversation

nhaehnle
Copy link
Collaborator

Let's do the lowering of non-split into split barriers in a new IR pass, AMDGPULowerIntrinsics. That way, there is no code duplication between SelectionDAG and GlobalISel. This simplifies some upcoming extensions to the code.

(cherry picked from commit e246f42fbdad5667d5a395ce65f4900d67610e72)

@llvmbot
Copy link
Member

llvmbot commented Aug 20, 2025

@llvm/pr-subscribers-backend-amdgpu

Author: Nicolai Hähnle (nhaehnle)

Changes

Let's do the lowering of non-split into split barriers in a new IR pass, AMDGPULowerIntrinsics. That way, there is no code duplication between SelectionDAG and GlobalISel. This simplifies some upcoming extensions to the code.

(cherry picked from commit e246f42fbdad5667d5a395ce65f4900d67610e72)


Patch is 29.84 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/154648.diff

8 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/AMDGPU.h (+11)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp (-12)
  • (added) llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp (+174)
  • (modified) llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp (+4-1)
  • (modified) llvm/lib/Target/AMDGPU/CMakeLists.txt (+1)
  • (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (-15)
  • (modified) llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll (+3-3)
  • (modified) llvm/test/CodeGen/AMDGPU/llc-pipeline.ll (+10)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 0059a862ba9b2..e3d4670af9e14 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -62,6 +62,7 @@ FunctionPass *createAMDGPURewriteOutArgumentsPass();
 ModulePass *
 createAMDGPULowerModuleLDSLegacyPass(const AMDGPUTargetMachine *TM = nullptr);
 ModulePass *createAMDGPULowerBufferFatPointersPass();
+FunctionPass *createAMDGPULowerIntrinsicsLegacyPass();
 FunctionPass *createSIModeRegisterPass();
 FunctionPass *createGCNPreRAOptimizationsLegacyPass();
 FunctionPass *createAMDGPUPreloadKernArgPrologLegacyPass();
@@ -153,6 +154,16 @@ struct AMDGPULowerBufferFatPointersPass
   const TargetMachine &TM;
 };
 
+void initializeAMDGPULowerIntrinsicsLegacyPass(PassRegistry &);
+
+struct AMDGPULowerIntrinsicsPass : PassInfoMixin<AMDGPULowerIntrinsicsPass> {
+  AMDGPULowerIntrinsicsPass(const TargetMachine &TM) : TM(TM) {}
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+
+private:
+  const TargetMachine &TM;
+};
+
 void initializeAMDGPUPrepareAGPRAllocLegacyPass(PassRegistry &);
 extern char &AMDGPUPrepareAGPRAllocLegacyID;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 5d31eed8fe7d7..eabf729d4853a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2007,18 +2007,6 @@ bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
     }
   }
 
-  if (STI.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
-    // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
-    MachineBasicBlock *MBB = MI.getParent();
-    const DebugLoc &DL = MI.getDebugLoc();
-    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))
-        .addImm(AMDGPU::Barrier::WORKGROUP);
-    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_WAIT))
-        .addImm(AMDGPU::Barrier::WORKGROUP);
-    MI.eraseFromParent();
-    return true;
-  }
-
   return selectImpl(MI, *CoverageInfo);
 }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
new file mode 100644
index 0000000000000..fe70edb9d18f4
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
@@ -0,0 +1,174 @@
+//===-- AMDGPULowerIntrinsics.cpp -------------------------------------------=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Lower intrinsics that would otherwise require separate handling in both
+// SelectionDAG and GlobalISel.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUTargetMachine.h"
+#include "GCNSubtarget.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+#define DEBUG_TYPE "amdgpu-lower-intrinsics"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPULowerIntrinsicsImpl {
+public:
+  Function &F;
+  const AMDGPUTargetMachine &TM;
+  const GCNSubtarget &ST;
+  DominatorTree *DT = nullptr;
+  PostDominatorTree *PDT = nullptr;
+  LoopInfo *LI = nullptr;
+
+  AMDGPULowerIntrinsicsImpl(Function &F, const AMDGPUTargetMachine &TM)
+      : F(F), TM(TM), ST(TM.getSubtarget<GCNSubtarget>(F)) {}
+
+  bool run();
+
+private:
+  bool visit(Instruction *I);
+  void splitBarrier(IntrinsicInst &I);
+};
+
+class AMDGPULowerIntrinsicsLegacy : public FunctionPass {
+public:
+  static char ID;
+
+  AMDGPULowerIntrinsicsLegacy() : FunctionPass(ID) {}
+
+  bool runOnFunction(Function &F) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetPassConfig>();
+
+    AU.addPreserved<DominatorTreeWrapperPass>();
+    AU.addPreserved<PostDominatorTreeWrapperPass>();
+    AU.addPreserved<LoopInfoWrapperPass>();
+  }
+};
+
+} // anonymous namespace
+
+bool AMDGPULowerIntrinsicsImpl::run() {
+  if (!ST.hasSplitBarriers())
+    return false;
+
+  bool Changed = false;
+
+  for (auto BBIt = F.begin(); BBIt != F.end(); ++BBIt) {
+    for (auto IIt = BBIt->begin(); IIt != BBIt->end();) {
+      Instruction *I = &*IIt;
+      ++IIt; // early increment in case the instruction is erased
+      if (I->isTerminator())
+        continue;
+
+      Changed |= visit(I);
+
+      // Fixup the basic block iterator in case the CFG was changed.
+      BBIt = IIt->getParent()->getIterator();
+    }
+  }
+
+  return Changed;
+}
+
+bool AMDGPULowerIntrinsicsImpl::visit(Instruction *I) {
+  if (auto *II = dyn_cast<IntrinsicInst>(I)) {
+    switch (II->getIntrinsicID()) {
+    case Intrinsic::amdgcn_s_barrier:
+      if (ST.hasSplitBarriers()) {
+        splitBarrier(*II);
+        return true;
+      }
+      return false;
+    default:
+      return false;
+    }
+  }
+
+  return false;
+}
+
+// Lower s_{cluster_}barrier to a sequence of split barrier intrinsics.
+void AMDGPULowerIntrinsicsImpl::splitBarrier(IntrinsicInst &I) {
+  assert(I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier);
+
+  IRBuilder<> B(&I);
+  unsigned WGMaxSize = ST.getFlatWorkGroupSizes(*I.getFunction()).second;
+  bool IsSingleWaveWG = WGMaxSize <= ST.getWavefrontSize();
+
+  if (IsSingleWaveWG) {
+    // The workgroup fits in a single wave. Downgrade to a wave barrier.
+    B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_wave_barrier, {});
+  } else {
+    // Lower to split barriers.
+    Value *BarrierID_32 = B.getInt32(AMDGPU::Barrier::WORKGROUP);
+    Value *BarrierID_16 = B.getInt16(AMDGPU::Barrier::WORKGROUP);
+    B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_s_barrier_signal,
+                      {BarrierID_32});
+    B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_s_barrier_wait,
+                      {BarrierID_16});
+  }
+
+  I.eraseFromParent();
+}
+
+PreservedAnalyses AMDGPULowerIntrinsicsPass::run(Function &F,
+                                                 FunctionAnalysisManager &AM) {
+  AMDGPULowerIntrinsicsImpl Impl(F,
+                                 static_cast<const AMDGPUTargetMachine &>(TM));
+  Impl.DT = AM.getCachedResult<DominatorTreeAnalysis>(F);
+  Impl.PDT = AM.getCachedResult<PostDominatorTreeAnalysis>(F);
+  Impl.LI = AM.getCachedResult<LoopAnalysis>(F);
+  if (!Impl.run())
+    return PreservedAnalyses::all();
+  PreservedAnalyses PA;
+  PA.preserve<DominatorTreeAnalysis>();
+  PA.preserve<PostDominatorTreeAnalysis>();
+  PA.preserve<LoopAnalysis>();
+  return PA;
+}
+
+bool AMDGPULowerIntrinsicsLegacy::runOnFunction(Function &F) {
+  auto &TPC = getAnalysis<TargetPassConfig>();
+  const AMDGPUTargetMachine &TM = TPC.getTM<AMDGPUTargetMachine>();
+
+  AMDGPULowerIntrinsicsImpl Impl(F, TM);
+  auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+  auto *PDTWP = getAnalysisIfAvailable<PostDominatorTreeWrapperPass>();
+  auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
+  Impl.DT = DTWP ? &DTWP->getDomTree() : nullptr;
+  Impl.PDT = PDTWP ? &PDTWP->getPostDomTree() : nullptr;
+  Impl.LI = LIWP ? &LIWP->getLoopInfo() : nullptr;
+  return Impl.run();
+}
+
+#define PASS_DESC "AMDGPU lower intrinsics"
+INITIALIZE_PASS_BEGIN(AMDGPULowerIntrinsicsLegacy, DEBUG_TYPE, PASS_DESC, false,
+                      false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_END(AMDGPULowerIntrinsicsLegacy, DEBUG_TYPE, PASS_DESC, false,
+                    false)
+
+char AMDGPULowerIntrinsicsLegacy::ID = 0;
+
+FunctionPass *llvm::createAMDGPULowerIntrinsicsLegacyPass() {
+  return new AMDGPULowerIntrinsicsLegacy;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index e393aa1987744..d9d06d6814386 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -538,6 +538,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPURemoveIncompatibleFunctionsLegacyPass(*PR);
   initializeAMDGPULowerModuleLDSLegacyPass(*PR);
   initializeAMDGPULowerBufferFatPointersPass(*PR);
+  initializeAMDGPULowerIntrinsicsLegacyPass(*PR);
   initializeAMDGPUReserveWWMRegsLegacyPass(*PR);
   initializeAMDGPURewriteAGPRCopyMFMALegacyPass(*PR);
   initializeAMDGPURewriteOutArgumentsPass(*PR);
@@ -1379,6 +1380,7 @@ void AMDGPUPassConfig::addCodeGenPrepare() {
     // nodes out of the graph, which leads to function-level passes not
     // being run on them, which causes crashes in the resource usage analysis).
     addPass(createAMDGPULowerBufferFatPointersPass());
+    addPass(createAMDGPULowerIntrinsicsLegacyPass());
     // In accordance with the above FIXME, manually force all the
     // function-level passes into a CGSCCPassManager.
     addPass(new DummyCGSCCPass());
@@ -2116,9 +2118,10 @@ void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const {
   // nodes out of the graph, which leads to function-level passes not
   // being run on them, which causes crashes in the resource usage analysis).
   addPass(AMDGPULowerBufferFatPointersPass(TM));
-
   addPass.requireCGSCCOrder();
 
+  addPass(AMDGPULowerIntrinsicsPass(TM));
+
   Base::addCodeGenPrepare(addPass);
 
   if (isPassEnabled(EnableLoadStoreVectorizer))
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index dc9dd220130ea..619ff4e5c73c4 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -71,6 +71,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUImageIntrinsicOptimizer.cpp
   AMDGPULibFunc.cpp
   AMDGPULowerBufferFatPointers.cpp
+  AMDGPULowerIntrinsics.cpp
   AMDGPULowerKernelArguments.cpp
   AMDGPULowerKernelAttributes.cpp
   AMDGPULowerModuleLDSPass.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 561019bb65549..19e93742633ca 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -10439,21 +10439,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
       }
     }
 
-    if (ST.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
-      // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
-      SDValue K =
-          DAG.getSignedTargetConstant(AMDGPU::Barrier::WORKGROUP, DL, MVT::i32);
-      SDValue BarSignal =
-          SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL,
-                                     MVT::Other, K, Op.getOperand(0)),
-                  0);
-      SDValue BarWait =
-          SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_WAIT, DL, MVT::Other, K,
-                                     BarSignal.getValue(0)),
-                  0);
-      return BarWait;
-    }
-
     return SDValue();
   };
 
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
index ceed41f3ed7c5..460e69622a652 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
@@ -8,11 +8,11 @@
 ; RUN:   | FileCheck -check-prefix=GCN-O3 %s
 
 
-; GCN-O0: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(atomic-expand,verify,gc-lowering,lower-constant-intrinsics,unreachableblockelim,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa,require<uniformity>,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,localstackalloc))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,phi-node-elimination,two-address-instruction,regallocfast,si-fix-vgpr-copies,remove-redundant-debug-values,fixup-statepoint-caller-saved,prolog-epilog,post-ra-pseudos,si-post-ra-bundler,fentry-insert,xray-instrumentation,patchable-function,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function))
+; GCN-O0: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(atomic-expand,verify,gc-lowering,lower-constant-intrinsics,unreachableblockelim,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,cgscc(function(AMDGPULowerIntrinsicsPass,lower-switch,lower-invoke,unreachableblockelim,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa,require<uniformity>,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,localstackalloc))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,phi-node-elimination,two-address-instruction,regallocfast,si-fix-vgpr-copies,remove-redundant-debug-values,fixup-statepoint-caller-saved,prolog-epilog,post-ra-pseudos,si-post-ra-bundler,fentry-insert,xray-instrumentation,patchable-function,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function))
 
-; GCN-O2: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,early-cse<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm<allowspeculation>),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,early-cse<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,cgscc(function(codegenprepare,load-store-vectorizer,lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require<uniformity>,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require<live-vars>,si-opt-vgpr-liverange,require<machine-loops>,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy<sgpr>,virt-reg-rewriter<no-clear-vregs>,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy<wwm>,si-lower-wwm-copies,virt-reg-rewriter<no-clear-vregs>,amdgpu-reserve-wwm-regs,greedy<vgpr>,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function))
+; GCN-O2: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,early-cse<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm<allowspeculation>),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,early-cse<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,cgscc(function(AMDGPULowerIntrinsicsPass,codegenprepare,load-store-vectorizer,lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require<uniformity>,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require<live-vars>,si-opt-vgpr-liverange,require<machine-loops>,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-m...
[truncated]

Comment on lines -2011 to -2018
// On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
MachineBasicBlock *MBB = MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))
.addImm(AMDGPU::Barrier::WORKGROUP);
BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_WAIT))
.addImm(AMDGPU::Barrier::WORKGROUP);
MI.eraseFromParent();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This isn't so much code that duplicating between globalisel and selection dag is a problem. Is a whole pass really needed for this one tiny case? Can we get this into one of the existing IR lowering passes (e.g. PreISelIntrinsicLowering)?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

PreISelIntrinsicLowering doesn't handle target-specific intrinsics. So is it better to teach that pass how to call into target hooks to handle target intrinsics, or to have a target-specific pass like in this patch?

Copy link
Collaborator Author

@nhaehnle nhaehnle Aug 21, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've looked at this. We could perhaps add a hook to TargetTransformInfo, but it feels a bit out of place. PreISelIntrinsicLowering iterates over the functions of the module and looks at calls from there, which means that hooking anything into that pass would basically only re-use this loop over the functions of the module.

I can sort of see the point about breaking the module pass manager (although -- isn't it really the other way around? Breaking up a function pass manager hurts, but a function pass inside a module pass manager ought to be fairly unproblematic), and perhaps a larger point that iterating over all code just for this is a bit excessive.

So I'm going to change this into a module pass that iterates like PreISelIntrinsicLowering.

As for how much code this is: I ran into this precisely because it's about to become a whole lot more code in the near future. This change is simply trying to prepare upstream for that.

@nhaehnle nhaehnle force-pushed the pub-split-barriers branch from fb93c7e to 773769f Compare August 22, 2025 00:24
Copy link

github-actions bot commented Aug 22, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.

@nhaehnle nhaehnle force-pushed the pub-split-barriers branch 3 times, most recently from 49f1318 to 6727443 Compare August 22, 2025 16:12
@nhaehnle
Copy link
Collaborator Author

Addressed the review comments and rebased on main to fix a build issue.

@nhaehnle
Copy link
Collaborator Author

Gentle ping

Let's do the lowering of non-split into split barriers and the
downgrading of barriers based on the workgroup size in a new IR pass,
AMDGPULowerIntrinsics. That way, there is no code duplication between
SelectionDAG and GlobalISel. This simplifies some upcoming extensions to
the code.

v2:
- turn into a Module pass
- also handle the downgrading of barriers for single-wave workgroups in
  the IR pass
- add tests for the new pass

(cherry picked from commit e246f42fbdad5667d5a395ce65f4900d67610e72)
@nhaehnle nhaehnle force-pushed the pub-split-barriers branch from 6727443 to de0e91f Compare August 26, 2025 19:05
Copy link
Collaborator

@rampitec rampitec left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM

@nhaehnle nhaehnle merged commit 353b5e4 into llvm:main Aug 28, 2025
9 checks passed
@nhaehnle nhaehnle deleted the pub-split-barriers branch August 28, 2025 14:02
@arsenm
Copy link
Contributor

arsenm commented Aug 28, 2025

I don't think this justifies a new pass and I'd rather just duplicate the code between the selectors. If you really wanted the code savings it would make more sense as an isel pseudo

@llvm-ci
Copy link
Collaborator

llvm-ci commented Aug 28, 2025

LLVM Buildbot has detected a new failure on builder sanitizer-x86_64-linux-android running on sanitizer-buildbot-android while building llvm at step 2 "annotate".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/186/builds/11973

Here is the relevant piece of the build log for the reference
Step 2 (annotate) failure: 'python ../sanitizer_buildbot/sanitizers/zorg/buildbot/builders/sanitizers/buildbot_selector.py' (failure)
...
[       OK ] AddressSanitizer.AtoiAndFriendsOOBTest (2239 ms)
[ RUN      ] AddressSanitizer.HasFeatureAddressSanitizerTest
[       OK ] AddressSanitizer.HasFeatureAddressSanitizerTest (0 ms)
[ RUN      ] AddressSanitizer.CallocReturnsZeroMem
[       OK ] AddressSanitizer.CallocReturnsZeroMem (14 ms)
[ DISABLED ] AddressSanitizer.DISABLED_TSDTest
[ RUN      ] AddressSanitizer.IgnoreTest
[       OK ] AddressSanitizer.IgnoreTest (0 ms)
[ RUN      ] AddressSanitizer.SignalTest
[       OK ] AddressSanitizer.SignalTest (197 ms)
[ RUN      ] AddressSanitizer.ReallocTest
[       OK ] AddressSanitizer.ReallocTest (25 ms)
[ RUN      ] AddressSanitizer.WrongFreeTest
[       OK ] AddressSanitizer.WrongFreeTest (100 ms)
[ RUN      ] AddressSanitizer.LongJmpTest
[       OK ] AddressSanitizer.LongJmpTest (0 ms)
[ RUN      ] AddressSanitizer.ThreadStackReuseTest
[       OK ] AddressSanitizer.ThreadStackReuseTest (11 ms)
[ DISABLED ] AddressSanitizer.DISABLED_MemIntrinsicUnalignedAccessTest
[ DISABLED ] AddressSanitizer.DISABLED_LargeFunctionSymbolizeTest
[ DISABLED ] AddressSanitizer.DISABLED_MallocFreeUnwindAndSymbolizeTest
[ RUN      ] AddressSanitizer.UseThenFreeThenUseTest
[       OK ] AddressSanitizer.UseThenFreeThenUseTest (123 ms)
[ RUN      ] AddressSanitizer.FileNameInGlobalReportTest
[       OK ] AddressSanitizer.FileNameInGlobalReportTest (116 ms)
[ DISABLED ] AddressSanitizer.DISABLED_StressStackReuseAndExceptionsTest
[ RUN      ] AddressSanitizer.MlockTest
[       OK ] AddressSanitizer.MlockTest (0 ms)
[ DISABLED ] AddressSanitizer.DISABLED_DemoThreadedTest
[ DISABLED ] AddressSanitizer.DISABLED_DemoStackTest
[ DISABLED ] AddressSanitizer.DISABLED_DemoThreadStackTest
[ DISABLED ] AddressSanitizer.DISABLED_DemoUAFLowIn
[ DISABLED ] AddressSanitizer.DISABLED_DemoUAFLowLeft
[ DISABLED ] AddressSanitizer.DISABLED_DemoUAFLowRight
[ DISABLED ] AddressSanitizer.DISABLED_DemoUAFHigh
[ DISABLED ] AddressSanitizer.DISABLED_DemoOOM
[ DISABLED ] AddressSanitizer.DISABLED_DemoDoubleFreeTest
[ DISABLED ] AddressSanitizer.DISABLED_DemoNullDerefTest
[ DISABLED ] AddressSanitizer.DISABLED_DemoFunctionStaticTest
[ DISABLED ] AddressSanitizer.DISABLED_DemoTooMuchMemoryTest
[ RUN      ] AddressSanitizer.LongDoubleNegativeTest
[       OK ] AddressSanitizer.LongDoubleNegativeTest (0 ms)
[----------] 19 tests from AddressSanitizer (27916 ms total)

[----------] Global test environment tear-down
[==========] 22 tests from 2 test suites ran. (27920 ms total)
[  PASSED  ] 22 tests.

  YOU HAVE 1 DISABLED TEST

Step 24 (run instrumented asan tests [aarch64/aosp_coral-userdebug/AOSP.MASTER]) failure: run instrumented asan tests [aarch64/aosp_coral-userdebug/AOSP.MASTER] (failure)
...
[ RUN      ] AddressSanitizer.HasFeatureAddressSanitizerTest
[       OK ] AddressSanitizer.HasFeatureAddressSanitizerTest (0 ms)
[ RUN      ] AddressSanitizer.CallocReturnsZeroMem
[       OK ] AddressSanitizer.CallocReturnsZeroMem (8 ms)
[ DISABLED ] AddressSanitizer.DISABLED_TSDTest
[ RUN      ] AddressSanitizer.IgnoreTest
[       OK ] AddressSanitizer.IgnoreTest (0 ms)
[ RUN      ] AddressSanitizer.SignalTest
[       OK ] AddressSanitizer.SignalTest (325 ms)
[ RUN      ] AddressSanitizer.ReallocTest
[       OK ] AddressSanitizer.ReallocTest (24 ms)
[ RUN      ] AddressSanitizer.WrongFreeTest
[       OK ] AddressSanitizer.WrongFreeTest (248 ms)
[ RUN      ] AddressSanitizer.LongJmpTest
[       OK ] AddressSanitizer.LongJmpTest (0 ms)
[ RUN      ] AddressSanitizer.ThreadStackReuseTest
[       OK ] AddressSanitizer.ThreadStackReuseTest (16 ms)
[ DISABLED ] AddressSanitizer.DISABLED_MemIntrinsicUnalignedAccessTest
[ DISABLED ] AddressSanitizer.DISABLED_LargeFunctionSymbolizeTest
[ DISABLED ] AddressSanitizer.DISABLED_MallocFreeUnwindAndSymbolizeTest
[ RUN      ] AddressSanitizer.UseThenFreeThenUseTest
[       OK ] AddressSanitizer.UseThenFreeThenUseTest (303 ms)
[ RUN      ] AddressSanitizer.FileNameInGlobalReportTest
[       OK ] AddressSanitizer.FileNameInGlobalReportTest (298 ms)
[ DISABLED ] AddressSanitizer.DISABLED_StressStackReuseAndExceptionsTest
[ RUN      ] AddressSanitizer.MlockTest
[       OK ] AddressSanitizer.MlockTest (0 ms)
[ DISABLED ] AddressSanitizer.DISABLED_DemoThreadedTest
[ DISABLED ] AddressSanitizer.DISABLED_DemoStackTest
[ DISABLED ] AddressSanitizer.DISABLED_DemoThreadStackTest
[ DISABLED ] AddressSanitizer.DISABLED_DemoUAFLowIn
[ DISABLED ] AddressSanitizer.DISABLED_DemoUAFLowLeft
[ DISABLED ] AddressSanitizer.DISABLED_DemoUAFLowRight
[ DISABLED ] AddressSanitizer.DISABLED_DemoUAFHigh
[ DISABLED ] AddressSanitizer.DISABLED_DemoOOM
[ DISABLED ] AddressSanitizer.DISABLED_DemoDoubleFreeTest
[ DISABLED ] AddressSanitizer.DISABLED_DemoNullDerefTest
[ DISABLED ] AddressSanitizer.DISABLED_DemoFunctionStaticTest
[ DISABLED ] AddressSanitizer.DISABLED_DemoTooMuchMemoryTest
[ RUN      ] AddressSanitizer.LongDoubleNegativeTest
[       OK ] AddressSanitizer.LongDoubleNegativeTest (0 ms)
[----------] 19 tests from AddressSanitizer (71678 ms total)

[----------] Global test environment tear-down
[==========] 22 tests from 2 test suites ran. (71681 ms total)
[  PASSED  ] 22 tests.

  YOU HAVE 1 DISABLED TEST

Serial 17031FQCB00176

@nhaehnle
Copy link
Collaborator Author

nhaehnle commented Sep 2, 2025

I'm not a fan of doing ever more stuff in MachineIR. The tooling around IR is better, the compiler code working on it tends to be cleaner, and it's generally better understood by more people.

In this particular case for example, we have downstream changes that introduce CFG manipulation into this code. And yeah, we have plenty of code that does that in MachineIR, but I still think it's better to do that kind of thing in IR.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

6 participants